library(readxl)
library(DT)
library(data.table)
library(dplyr)
library(ggplot2)
library(plotly)
library(cowplot)
library(curl)
library(biomaRt)
library(sqldf)
# *** susieR ****
# library(knitrBootstrap) #install_github('jimhester/knitrBootstrap')
library(susieR) # devtools::install_github("stephenslab/susieR")
# *** finemapr ****
## finemapr contains: finemap, CAVIAR, and PAINTOR
#library(finemapr) # devtools::install_github("variani/finemapr")
library(roxygen2)#roxygenize()
# *** locuscomparer ****
# https://github.com/boxiangliu/locuscomparer
library(locuscomparer); #devtools::install_github("boxiangliu/locuscomparer")
# thm <- knitr::knit_theme$get("bipolar")
# knitr::knit_theme$set(thm)
sessionInfo()## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] locuscomparer_1.0.0 roxygen2_6.1.1 susieR_0.6.2.0411
## [4] sqldf_0.4-11 RSQLite_2.1.1 gsubfn_0.7
## [7] proto_1.0.0 biomaRt_2.38.0 curl_3.3
## [10] cowplot_0.9.4 plotly_4.8.0 ggplot2_3.1.0
## [13] dplyr_0.7.8 data.table_1.12.0 DT_0.5.1
## [16] readxl_1.2.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.0 lattice_0.20-38 tidyr_0.8.2
## [4] prettyunits_1.0.2 assertthat_0.2.0 digest_0.6.18
## [7] R6_2.3.0 cellranger_1.1.0 plyr_1.8.4
## [10] chron_2.3-53 stats4_3.5.1 evaluate_0.12
## [13] httr_1.4.0 pillar_1.3.1 rlang_0.3.1
## [16] progress_1.2.0 lazyeval_0.2.1 blob_1.1.1
## [19] S4Vectors_0.20.1 Matrix_1.2-15 rmarkdown_1.11
## [22] stringr_1.3.1 htmlwidgets_1.3 RCurl_1.95-4.11
## [25] bit_1.1-14 munsell_0.5.0 compiler_3.5.1
## [28] xfun_0.4 pkgconfig_2.0.2 BiocGenerics_0.28.0
## [31] htmltools_0.3.6 tcltk_3.5.1 tidyselect_0.2.5
## [34] expm_0.999-3 tibble_2.0.1 IRanges_2.16.0
## [37] XML_3.98-1.16 viridisLite_0.3.0 crayon_1.3.4
## [40] withr_2.1.2 commonmark_1.7 bitops_1.0-6
## [43] grid_3.5.1 jsonlite_1.6 gtable_0.2.0
## [46] DBI_1.0.0 magrittr_1.5 scales_1.0.0
## [49] stringi_1.2.4 bindrcpp_0.2.2 xml2_1.2.0
## [52] tools_3.5.1 bit64_0.9-7 Biobase_2.42.0
## [55] glue_1.3.0 purrr_0.3.0 hms_0.4.2
## [58] parallel_3.5.1 yaml_2.2.0 AnnotationDbi_1.44.0
## [61] colorspace_1.4-0 memoise_1.1.0 knitr_1.21
## [64] bindr_0.1.1
print(paste("susieR ", packageVersion("susieR")))## [1] "susieR 0.6.2.411"
createDT <- function(DF, caption="", scrollY=400){
data <- DT::datatable(DF, caption=caption,
extensions = 'Buttons',
options = list( dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
scrollY = scrollY, scrollX=T, scrollCollapse = T, paging = F,
columnDefs = list(list(className = 'dt-center', targets = "_all"))
)
)
return(data)
}
createDT_html <- function(DF, caption="", scrollY=400){
print( htmltools::tagList( createDT(DF, caption, scrollY)) )
} Use bioMart to get TSS positions for each gene
get_TSS_position <- function(gene){
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
# att <- listAttributes(mart)
# grep("transcription", att$name, value=TRUE)
TSS <- getBM(mart=mart,
attributes=c("hgnc_symbol","transcription_start_site", "version"),
filters="hgnc_symbol", values=gene)
} For each gene, get the position of top SNP (the one with the greatest effect size/Beta)
## Only the significant subset of results
Nalls_SS_sig <- read_excel("Data/Nalls2018_S2_SummaryStats.xlsx", sheet = "Data")
top_SNPs <- Nalls_SS_sig %>% group_by(`Nearest Gene`) %>% top_n(1, `Beta, all studies`)
# lapply(Nalls_SS, function(x){print(paste(length(unique(x))))}) %>% data.frame() %>% t()
createDT(Nalls_SS_sig, caption = "Nalls (2018): Table S2. Detailed summary statistics on all nominated risk variants, known and novel")# Nall (2018) QTL Stats
# Nalls_QTL <- read_excel("Data/Nalls2018_S4_QTL-MR.xlsx")
# createDT(Nalls_QTL, caption = "Nalls (2018) Table S4. Expanded summary statistics for QTL Mendelian randomization") Nalls et al. (2019) data: https://github.com/neurogenetics/meta5
get_flanking_SNPs <- function(gene, top_SNPs, bp_distance=1000000){
topSNP_sub <- subset(top_SNPs, `Nearest Gene`==gene)
## Full dataset
filePath <- "Data/META.PD.NALLS2014.PRS.tsv"# "Data/nallsEtal2019_no23andMe.tab.txt"
# Get col names
# strsplit( readLines(filePath, n = 2), "\t")
minPos <- topSNP_sub$BP - bp_distance
maxPos <- topSNP_sub$BP + bp_distance
geneSubset <- read.csv.sql(filePath, sep="\t", stringsAsFactors=F,
sql = paste('select * from file where CHR =',topSNP_sub$CHR,
'AND POS BETWEEN', minPos, 'AND', maxPos))
geneSubset <- geneSubset %>% dplyr::rename(SNP=MarkerName)
return(geneSubset)
} The I^2 statistic describes the percentage of variation across studies that seems not to be due to chance.
All input variants must be on the same chromosome and match a bi-allelic variant
get_LDmatrix <- function(rs_list, token="df4298d58dc4",
population_list=c("CEU","TSI","FIN","GBR","IBS"),
R2_or_D="r2"){
populations <- ifelse(length(population_list)==1, population_list,
paste(population_list, collapse="%2B"))
rs_IDs <- paste(rs_list, collapse="%0A")
con <-curl(paste("https://ldlink.nci.nih.gov/LDlinkRest/ldmatrix?snps=", rs_IDs,
"&pop=",populations,
"&r2_d=",R2_or_D,
"&token=",token,
sep=""))
open(con)
LD_matrix <- read.delim(con, header = T, row.names = "RS_number" )
LD_matrix <- LD_matrix %>% data.matrix(rownames.force = T)
LD_matrix[is.na(LD_matrix)] <- 0 # | LD_matrix<0
close(con)
return(LD_matrix)
}
susie_on_gene <- function(gene, top_SNPs, SNP_limit=300){
geneSubset <- get_flanking_SNPs(gene, top_SNPs)
geneSubset <- geneSubset%>% arrange(desc(abs(Effect)))
geneSubset <- geneSubset[1:SNP_limit,]
geneSubset$Coord <- paste(geneSubset$CHR,geneSubset$POS, sep="")
#paste("chr",geneSubset$CHR,":",geneSubset$POS, sep="")
# Subset summary stats
# cat("\n Preparing summary stats...\n")
labelSNPs <- geneSubset[1:5,]
before_plot <- ggplot(geneSubset, aes(x=Coord, y=Effect, label=SNP, color= -log(P.value) )) +
geom_hline(yintercept=0,alpha=.5, linetype=2, size=.5) +
geom_point() +
geom_point(data=labelSNPs[1,],
pch=21, fill=NA, size=4, colour="red", stroke=1) +
geom_segment(aes(xend=Coord, yend=0, color= -log(P.value)), alpha=.5) +
geom_text(data=labelSNPs, aes(label=SNP),
vjust="inward",hjust="inward", nudge_y=.05, nudge_x=.25) +
labs(title=paste(gene, "Before Fine Mapping",sep="\n"), y="Beta", x="BP Position",
color="-log(p-value)\nAll Studies") +
theme(plot.title = element_text(hjust = 0.5))
# Get LD matrix from NIH API
# cat("\n Downloading LD matrix...\n")
LD_matrix <- get_LDmatrix(geneSubset$SNP )
# Run Susie
# cat("\n Running SusieR...\n")
geneSubset_filt <- subset(geneSubset, SNP %in% row.names(LD_matrix) )
b <- geneSubset_filt$Effect
se <- geneSubset_filt$StdErr
fitted_bhat <- susie_bhat(bhat = b,
shat = se,
R = LD_matrix,
n = nrow(LD_matrix),
var_y = var(b),
L = 1, # Must lower to 1 if entering only 3 SNPs
scaled_prior_variance = 0.1,
estimate_residual_variance = TRUE,
estimate_prior_variance = FALSE,
standardize = TRUE)
summary(fitted_bhat)$cs
susieDF <- data.frame(SNP=names(fitted_bhat$X_column_scale_factors), PIP=fitted_bhat$pip) %>%
base::merge(subset(geneSubset_filt,
select=c("SNP","Effect","P.value", "Coord")), by="SNP") %>%
arrange(desc(PIP))
labelSNPs_PIP <- susieDF[1:5,]
after_plot <- ggplot(susieDF, aes(x=Coord, y=PIP, label=SNP, color= -log(P.value) )) +
geom_hline(yintercept=0,alpha=.5, linetype=2, size=.5) +
geom_point() +
geom_point(data=susieDF[susieDF$PIP == max(susieDF$PIP),],
pch=21, fill=NA, size=4, colour="green", stroke=1) +
geom_segment(aes(xend=Coord, yend=0, color= -log(P.value))) +
geom_text(data=labelSNPs_PIP, aes(label=SNP),
vjust="inward",hjust="inward",nudge_x = .25) +
labs(title=paste(gene, "After Fine Mapping",sep="\n"), y="PIP", x="BP Position",
color="-log(p-value)\nAll Studies") +
theme(plot.title = element_text(hjust = 0.5))#theme(legend.position="none")
plot_grid(before_plot, after_plot, nrow = 2) %>% print()
# susie_plot(fitted_bhat, y="PIP", b=b, add_bar = T)
createDT_html(susieDF, paste("susieR Results: ", gene), scrollY = 200)
# createDT_html(LD_matrix, paste("LD matrix from 1000 Genomes: ", gene), scrollY = 200)
return(susieDF)
} Only genes with at least 3 variants associated with them (from Nalls et al. (2019)) were included for fine mapping.
geneList <- c("LRRK2","GBAP1","SNCA","VPS13C","GCH1")#unique(top_SNPs$`Nearest Gene`)
#Nalls_SS %>% group_by(`Nearest Gene`) %>% tally() %>% subset(n>2)
fineMapped_topSNPs <- data.table()
for (gene in geneList){
cat("\n")
cat("### ", gene, "\n")
results <- susie_on_gene(gene, top_SNPs)
fineMapped_topSNPs <- rbind(fineMapped_topSNPs, subset(results, PIP==max(PIP)) %>% as.data.table())
cat("\n")
}createDT(fineMapped_topSNPs, "Potential Causal SNPs Identified by susieR", scrollY = 200)